#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>

#include "imageppm.h"
#include "imagerect.h"

/*
 * Terminates the program if a CUDA runtime call reported a failure.
 *
 * error: status code returned by a CUDA runtime API call.
 *
 * On cudaSuccess the function returns immediately. For any other value it
 * prints the runtime's textual description of the error to stdout and ends
 * the process with exit status -9.
 */
void CUDA_ERR(cudaError_t error){
    if( cudaSuccess == error )
        return;

    const char *description = cudaGetErrorString(error);
    printf("CUDA_ERROR: %s\n", description);
    exit(-9);
}

/*
 * Box-blur ("smooth") kernel for one 8-bit image channel stored row by row.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread processes `npt`
 * consecutive pixels, starting at
 *     (blockIdx.x * blockDim.x + threadIdx.x) * npt
 * so the launch must provide at least ceil(width * height / npt) threads.
 * No shared memory is used.
 *
 * origemWidth, origemHeight: channel dimensions in pixels.
 * origem:      input channel (device pointer, width * height bytes).
 * destino:     output channel (device pointer, width * height bytes);
 *              must not alias `origem`, since neighbours are read while
 *              other threads write.
 * npt:         number of consecutive pixels handled by each thread.
 * smooth_size: window radius; the window is (2*smooth_size+1) squared.
 *
 * Neighbours that fall outside the image contribute 0 to the sum, but the
 * divisor is always the full window area, so pixels near the border come
 * out darker than a normalised average would.
 */
__global__ void smooth(int origemWidth, int origemHeight, unsigned char *origem,
                        unsigned char *destino, int npt, int smooth_size) {

    const int lado = smooth_size*2+1;
    const int smooth_s = lado*lado;
    const int ultimoPixel = origemWidth*origemHeight;
    const int primeiro = (blockIdx.x*blockDim.x + threadIdx.x)*npt;

    for (int i = 0; i < npt; ++i){
        const int center = primeiro + i;

        // Valid pixel indices are 0 .. ultimoPixel-1. The comparison has to be
        // >= : with > the thread whose index equals ultimoPixel would write
        // one byte past the end of `destino`.
        if (center >= ultimoPixel) return;

        const int linhaAtual = center/origemWidth;
        const int colunaAtual = center - linhaAtual*origemWidth;

        // Add up every value inside the smoothing window; for a 5x5 window
        // that is up to 25 values.
        int sum = 0;
        for (int l = -smooth_size; l <= smooth_size; ++l){
            const int linha = linhaAtual + l;
            if (linha < 0 || linha >= origemHeight) continue; // vertical bound

            for (int j = -smooth_size; j <= smooth_size; ++j){
                const int coluna = colunaAtual + j;
                // Horizontal bound: the neighbour must stay on row `linha`
                // and not wrap around into an adjacent row.
                if (coluna >= 0 && coluna < origemWidth)
                    sum += origem[linha*origemWidth + coluna];
            }
        }

        // Divide the accumulated sum by the window area and store the result
        // in the output image. sum <= 255*smooth_s, so the quotient always
        // fits in one byte.
        destino[center] = (unsigned char)(sum/smooth_s);
    }
}

/*
 * Smooths an image on the GPU, one colour channel at a time, and reports
 * how long each phase took.
 *
 * img:         source image; only read here.
 * smooth_size: window radius forwarded to the `smooth` kernel.
 * loadTime:    out, host-to-device copy time as reported by
 *              cudaEventElapsedTime.
 * execTime:    out, time between the events recorded around the three
 *              kernel launches.
 * storeTime:   out, device-to-host copy time.
 * nthreads:    threads per block. -1 selects the device maximum; any other
 *              value that is not in 1..maxThreadsPerBlock is also replaced by
 *              the device maximum (0 used to divide by zero, larger values
 *              used to fail at launch).
 *
 * Returns a newly malloc'ed Image whose r/g/b fields point at the smoothed
 * channel buffers taken from the temporary ImageRect objects.
 * Any CUDA failure terminates the program through CUDA_ERR.
 */
Image * sendImage ( Image *img, int smooth_size, float *loadTime, float *execTime, float *storeTime, int nthreads ) {
    cudaEvent_t start, exec, execEnd, store;
    cudaDeviceProp prop;

    ImageRect *imgRect[3];
    unsigned char *imgRGB[3];
    unsigned char *imgRGBSmoothed[3];
    int c;

    const int channelSize = img->width * img->height;

    CUDA_ERR(cudaGetDeviceProperties(&prop, 0));

    CUDA_ERR(cudaEventCreate(&start));
    CUDA_ERR(cudaEventCreate(&exec));
    CUDA_ERR(cudaEventCreate(&execEnd));
    CUDA_ERR(cudaEventCreate(&store));

    // One full-image rectangle per colour channel.
    imgRect[0] = CreateImageRect(img, COLOR_RED, 0, 0, img->width, img->height);
    imgRect[1] = CreateImageRect(img, COLOR_GREEN, 0, 0, img->width, img->height);
    imgRect[2] = CreateImageRect(img, COLOR_BLUE, 0, 0, img->width, img->height);
    for (c = 0; c < 3; ++c) {
        // NOTE(review): assumes CreateImageRect signals failure with NULL - confirm in imagerect.h
        if (imgRect[c] == NULL) {
            fprintf(stderr, "sendImage: CreateImageRect failed for channel %d\n", c);
            exit(EXIT_FAILURE);
        }
    }

    // Allocate device memory: one input and one output buffer per channel.
    for (c = 0; c < 3; ++c)
        CUDA_ERR(cudaMalloc( (void**)&(imgRGB[c]), channelSize));
    for (c = 0; c < 3; ++c)
        CUDA_ERR(cudaMalloc( (void**)&(imgRGBSmoothed[c]), channelSize));

    CUDA_ERR(cudaEventRecord(start));

    for (c = 0; c < 3; ++c)
        CUDA_ERR(cudaMemcpy((void*)imgRGB[c], imgRect[c]->data, channelSize, cudaMemcpyHostToDevice));

    // Fall back to the device maximum for -1 and for any unusable value.
    if (nthreads <= 0 || nthreads > prop.maxThreadsPerBlock)
        nthreads = prop.maxThreadsPerBlock;

    const int pixelsPorThread = 8;
    const int pixelsPorBloco = nthreads * pixelsPorThread;
    // Ceil-division: enough blocks so that every pixel is owned by a thread.
    const int nBlocos = channelSize / pixelsPorBloco + (channelSize % pixelsPorBloco != 0);

    printf("Blocos: %d; Threads: %d; PpT: %d\n", nBlocos, nthreads, pixelsPorThread);

    CUDA_ERR(cudaEventRecord(exec));

    // Run the kernel once per channel.
    for (c = 0; c < 3; ++c)
        smooth<<<nBlocos,nthreads>>>(img->width, img->height, imgRGB[c], imgRGBSmoothed[c], pixelsPorThread, smooth_size);

    CUDA_ERR(cudaPeekAtLastError());
    CUDA_ERR(cudaEventRecord(execEnd));

    // Copy the results back to the CPU, reusing the rectangles' buffers.
    // These blocking copies also surface errors raised while the kernels ran.
    for (c = 0; c < 3; ++c)
        CUDA_ERR(cudaMemcpy((void*)imgRect[c]->data, imgRGBSmoothed[c], channelSize, cudaMemcpyDeviceToHost));

    CUDA_ERR(cudaEventRecord(store));

    for (c = 0; c < 3; ++c)
        CUDA_ERR(cudaFree(imgRGB[c]));
    for (c = 0; c < 3; ++c)
        CUDA_ERR(cudaFree(imgRGBSmoothed[c]));

    Image *ret = (Image*) malloc(sizeof(Image));
    if (ret == NULL) {
        fprintf(stderr, "sendImage: out of memory\n");
        exit(EXIT_FAILURE);
    }
    ret->r = imgRect[0]->data;
    ret->g = imgRect[1]->data;
    ret->b = imgRect[2]->data;
    ret->width = img->width;
    ret->height = img->height;
    ret->colorsize = img->colorsize;

    // Only the ImageRect structs are released; their data buffers now belong
    // to `ret`.
    // NOTE(review): assumes `data` is a separate allocation - confirm in imagerect.h
    free(imgRect[0]);
    free(imgRect[1]);
    free(imgRect[2]);

    CUDA_ERR(cudaEventSynchronize(store));
    *loadTime = 0, *execTime = 0, *storeTime = 0;
    CUDA_ERR(cudaEventElapsedTime(loadTime, start, exec));
    CUDA_ERR(cudaEventElapsedTime(execTime, exec, execEnd));
    CUDA_ERR(cudaEventElapsedTime(storeTime, execEnd, store));

    // The events are no longer needed once the elapsed times have been read.
    CUDA_ERR(cudaEventDestroy(start));
    CUDA_ERR(cudaEventDestroy(exec));
    CUDA_ERR(cudaEventDestroy(execEnd));
    CUDA_ERR(cudaEventDestroy(store));

    return ret;
}


/*
	argv[1] = Image in
	argv[2] = Image out
	argv[3] = threads per block (optional; -1 or omitted = device maximum)

	Loads the input image, smooths it on the GPU with a window radius of 2,
	saves the result and prints six timings in seconds on one line:
	image load, save + release, host-to-device copy, kernel execution,
	device-to-host copy, total wall time.
*/
int main(int argc, char *argv[] ) {

    struct timeval start_all, end_all;
    struct timeval end_load, start_ret;
    float loadTime, execTime, storeTime;
    int threads = -1;

    // argv[1] and argv[2] are mandatory; reading them unchecked would
    // dereference NULL when they are missing.
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <image-in> <image-out> [threads-per-block]\n",
                argc > 0 ? argv[0] : "smooth");
        return 1;
    }

    if (argc > 3) {
        char *resto = NULL;
        long valor = strtol(argv[3], &resto, 10);
        // Accept -1 (device maximum) or a positive count that fits in an int.
        // Zero would make the block-count computation divide by zero.
        if (resto == argv[3] || *resto != '\0'
                || (valor < 1 && valor != -1) || valor != (long)(int)valor) {
            fprintf(stderr, "Invalid threads-per-block value: %s\n", argv[3]);
            return 1;
        }
        threads = (int)valor;
    }

    gettimeofday(&start_all, NULL);

    // LOAD START

    Image *img, *smoothedImg;
    img = load_image(argv[1]);
    // NOTE(review): assumes load_image signals failure with NULL - confirm in imageppm.h
    if (img == NULL) {
        fprintf(stderr, "Could not load image: %s\n", argv[1]);
        return 1;
    }

    // LOAD END

    gettimeofday(&end_load, NULL);

    // START PROCESSING

    smoothedImg = sendImage(img, 2, &loadTime, &execTime, &storeTime, threads);

    // END PROCESSING

    gettimeofday(&start_ret, NULL);

    // START RETURNING RESULT

    save_image(smoothedImg, argv[2], SAVE_MODE_BINARY);

    release_image(img);
    release_image(smoothedImg);

    gettimeofday(&end_all, NULL);

    // END RESULT

    // Wall-clock intervals in seconds; the GPU timings are divided by 1000
    // below to print them on the same scale.
    double result_all = end_all.tv_sec - start_all.tv_sec + (end_all.tv_usec - start_all.tv_usec) / 1000000.0;
    double result_load = end_load.tv_sec - start_all.tv_sec + (end_load.tv_usec - start_all.tv_usec) / 1000000.0;
    double result_ret = end_all.tv_sec - start_ret.tv_sec + (end_all.tv_usec - start_ret.tv_usec) / 1000000.0;
    printf("%lf %lf %lf %f %f %f\n", result_load, result_ret, loadTime/1000.0, execTime/1000.0, storeTime/1000.0, result_all);

    return 0;
}
